Contents: Featurisation and Model Tuning Project
# Import all the relevant libraries needed to complete the analysis, visualization, modeling and presentation
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')
%matplotlib inline
from scipy import stats
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score
from sklearn.metrics import plot_precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score, plot_roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from kmodes.kprototypes import KPrototypes
import xgboost as xgb
from xgboost import plot_importance
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
C:\Anaconda\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import MultiIndex, Int64Index
# CSV File 1
# Load the raw signal dataset. Per the info() output below it has 1567 rows
# and 592 columns: a 'Time' timestamp, ~590 numeric sensor signals named
# '0'..'589', and a 'Pass/Fail' target coded as -1/1.
dfa=pd.read_csv('signal-data.csv')
dfa.info()   # column count, dtypes and memory footprint
dfa.head()   # first five rows as a quick sanity check
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
5 rows × 592 columns
# describe() generates descriptive statistics (count, mean, std, min,
# quartiles, max) that summarize the central tendency, dispersion and shape
# of each column's distribution, excluding NaN values.
# By default it covers only the numeric columns, so the categorical
# 'Time' column is skipped here.
# .T transposes the result so each feature becomes a row -- much easier
# to scan with ~590 features.
dfa.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1561.0 | 3014.452896 | 73.621787 | 2743.2400 | 2966.260000 | 3011.4900 | 3056.6500 | 3356.3500 |
| 1 | 1560.0 | 2495.850231 | 80.407705 | 2158.7500 | 2452.247500 | 2499.4050 | 2538.8225 | 2846.4400 |
| 2 | 1553.0 | 2200.547318 | 29.513152 | 2060.6600 | 2181.044400 | 2201.0667 | 2218.0555 | 2315.2667 |
| 3 | 1553.0 | 1396.376627 | 441.691640 | 0.0000 | 1081.875800 | 1285.2144 | 1591.2235 | 3715.0417 |
| 4 | 1553.0 | 4.197013 | 56.355540 | 0.6815 | 1.017700 | 1.3168 | 1.5257 | 1114.5366 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 586 | 1566.0 | 0.021458 | 0.012358 | -0.0169 | 0.013425 | 0.0205 | 0.0276 | 0.1028 |
| 587 | 1566.0 | 0.016475 | 0.008808 | 0.0032 | 0.010600 | 0.0148 | 0.0203 | 0.0799 |
| 588 | 1566.0 | 0.005283 | 0.002867 | 0.0010 | 0.003300 | 0.0046 | 0.0064 | 0.0286 |
| 589 | 1566.0 | 99.670066 | 93.891919 | 0.0000 | 44.368600 | 71.9005 | 114.7497 | 737.3048 |
| Pass/Fail | 1567.0 | -0.867262 | 0.498010 | -1.0000 | -1.000000 | -1.0000 | -1.0000 | 1.0000 |
591 rows × 8 columns
Observations:
# Per-column missing-value summary (count and fraction of nulls).
def missing_check(df):
    """Summarise missing data per column of *df*.

    Returns a DataFrame indexed by column name with two columns:
    'Total'   -- number of null entries in that column
    'Percent' -- fraction (0-1) of entries that are null
    ordered from most to least missing.
    """
    null_flags = df.isnull()
    # mean() of a boolean frame is exactly sum()/len, i.e. the null fraction.
    summary = pd.DataFrame({'Total': null_flags.sum(),
                            'Percent': null_flags.mean()})
    # Worst-affected columns first, mirroring the descending sort.
    return summary.sort_values('Total', ascending=False)
# Missing-value counts and fractions for the raw data, worst columns first.
missing_check(dfa)
| Total | Percent | |
|---|---|---|
| 158 | 1429 | 0.911934 |
| 292 | 1429 | 0.911934 |
| 293 | 1429 | 0.911934 |
| 157 | 1429 | 0.911934 |
| 85 | 1341 | 0.855775 |
| ... | ... | ... |
| 386 | 0 | 0.000000 |
| 361 | 0 | 0.000000 |
| 360 | 0 | 0.000000 |
| 359 | 0 | 0.000000 |
| Pass/Fail | 0 | 0.000000 |
592 rows × 2 columns
def rmissingvaluecol(dff, threshold):
    """Return the columns of *dff* to keep after a missing-value filter.

    Parameters
    ----------
    dff : pd.DataFrame
    threshold : float
        Maximum allowed percentage (0-100) of missing values. Columns with
        >= *threshold* percent missing are excluded from the result.

    Returns
    -------
    list
        Labels of the surviving columns, in original column order.

    Side effect: prints how many columns exceed the threshold and which.
    """
    # FIX: the original used dff.drop(..., 1) -- the positional `axis`
    # argument is removed in pandas 2.0 and raised a TypeError. Compute the
    # keep-list directly from the per-column missing percentage instead.
    pct_missing = 100 * dff.isnull().sum() / len(dff.index)
    keep = [col for col in dff.columns if pct_missing[col] < threshold]
    dropped = set(dff.columns) - set(keep)
    print("# Columns having more than %s percent missing values: " % threshold, len(dropped))
    print("Columns:\n", list(dropped))
    return keep
# Report-only call: lists the columns with >= 20% missing values (the
# actual filtering happens in the next step); the returned list is unused.
rmissingvaluecol(dfa,20) # Here threshold is 20% which means we are going to drop columns having more than 20% of missing values
# Columns having more than 20 percent missing values: 32 Columns: ['157', '580', '382', '245', '110', '385', '220', '345', '492', '247', '72', '346', '293', '358', '85', '158', '73', '579', '244', '581', '516', '246', '519', '384', '111', '578', '109', '292', '383', '518', '517', '112']
['Time', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '221', '222', '223', '224', '225', '226', '227', '228', '229', '230', '231', '232', '233', '234', '235', '236', '237', '238', '239', '240', '241', '242', '243', '248', '249', '250', '251', '252', '253', '254', '255', '256', '257', '258', '259', '260', '261', '262', '263', '264', '265', '266', '267', '268', '269', '270', '271', '272', '273', '274', '275', '276', '277', '278', '279', '280', '281', '282', '283', '284', '285', '286', '287', '288', '289', '290', '291', '294', '295', '296', '297', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', 
'315', '316', '317', '318', '319', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '339', '340', '341', '342', '343', '344', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '373', '374', '375', '376', '377', '378', '379', '380', '381', '386', '387', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '400', '401', '402', '403', '404', '405', '406', '407', '408', '409', '410', '411', '412', '413', '414', '415', '416', '417', '418', '419', '420', '421', '422', '423', '424', '425', '426', '427', '428', '429', '430', '431', '432', '433', '434', '435', '436', '437', '438', '439', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459', '460', '461', '462', '463', '464', '465', '466', '467', '468', '469', '470', '471', '472', '473', '474', '475', '476', '477', '478', '479', '480', '481', '482', '483', '484', '485', '486', '487', '488', '489', '490', '491', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '510', '511', '512', '513', '514', '515', '520', '521', '522', '523', '524', '525', '526', '527', '528', '529', '530', '531', '532', '533', '534', '535', '536', '537', '538', '539', '540', '541', '542', '543', '544', '545', '546', '547', '548', '549', '550', '551', '552', '553', '554', '555', '556', '557', '558', '559', '560', '561', '562', '563', '564', '565', '566', '567', '568', '569', '570', '571', '572', '573', '574', '575', '576', '577', '582', '583', '584', '585', '586', '587', '588', '589', 'Pass/Fail']
# Keep only the columns with < 20% missing values (32 columns are dropped,
# going from 592 to 560 columns).
l = rmissingvaluecol(dfa, 20)
dfa = dfa[l]
# Columns having more than 20 percent missing values: 32 Columns: ['157', '580', '382', '245', '110', '385', '220', '345', '492', '247', '72', '346', '293', '358', '85', '158', '73', '579', '244', '581', '516', '246', '519', '384', '111', '578', '109', '292', '383', '518', '517', '112']
dfa.info()
dfa.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 560 entries, Time to Pass/Fail dtypes: float64(558), int64(1), object(1) memory usage: 6.7+ MB
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | 14.9509 | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 10.9003 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 9.2721 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 8.5831 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | 10.9698 | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
5 rows × 560 columns
# Remaining per-column null counts after the 20% filter.
dfa.isnull().sum()
Time 0
0 6
1 7
2 14
3 14
..
586 1
587 1
588 1
589 1
Pass/Fail 0
Length: 560, dtype: int64
A missing value is assumed to indicate the absence of a signal, so NaNs are treated as zero:
# Replace the remaining NaN/NA values. A missing reading is treated as
# "no signal", so zero-fill is used; mean-imputation is kept below as a
# commented-out alternative.
# dfa.fillna(dfa.mean(),inplace = True)
dfa.fillna(0,inplace=True)
# Confirm no NULL values remain anywhere in the frame (expects False)
dfa.isnull().any().any()
False
# Remove zero-variance (constant) signal columns -- they carry no
# information for modelling.
numeric_cols = dfa.select_dtypes([np.number]).columns
col_std = dfa[numeric_cols].std()
constant_cols = col_std[col_std == 0].index
dfa.drop(columns=constant_cols, inplace=True)
dfa.head()
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | 14.9509 | 0.5005 | 0.0118 | 0.0035 | 2.3630 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 10.9003 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 9.2721 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 8.5831 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | 10.9698 | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
5 rows × 448 columns
# Shape and dtypes after dropping the constant columns (448 columns left).
dfa.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 448 entries, Time to Pass/Fail dtypes: float64(446), int64(1), object(1) memory usage: 5.4+ MB
# 'Time' is the timestamp of each reading. All its values are unique, so it
# carries no predictive signal for classification and can be dropped safely.
dfa.drop(['Time'], axis=1, inplace=True)
# Recode the target from {-1, 1} to {0, 1}: -1 -> 0 (the majority class),
# 1 -> 1, so downstream metrics/samplers see a conventional binary label.
dfa['Pass/Fail']=dfa['Pass/Fail'].replace([-1,1],[0,1])
# Highlight feature pairs whose correlation magnitude exceeds 0.7.
# BUG FIX: the original plotted abs(corr > 0.7) -- the absolute value of a
# boolean mask -- which misses strong *negative* correlations entirely.
# Taking abs(corr) first flags both corr > 0.7 and corr < -0.7.
plt.figure(figsize=(20,18))
corr=dfa.corr()
sns.heatmap(abs(corr) > 0.7, cmap="Reds");
# Work on a copy without the target so EDA transforms don't touch the label.
dfa1=dfa.copy()
dfa1.drop(['Pass/Fail'],axis=1,inplace=True)
# Absolute pairwise correlation between the remaining features.
corr_matrix = dfa1.corr().abs()
# Keep only the strict upper triangle (k=1) so each pair is considered once.
# FIX: the np.bool alias was removed in NumPy 1.24 (AttributeError);
# the builtin bool is the documented replacement.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Drop a column if it correlates > 0.70 with any earlier-listed column.
to_drop = [column for column in upper.columns if any(upper[column] > 0.70)]
dfa1.drop(to_drop, axis=1, inplace=True)
row,column=dfa1.shape
print('After dropping the correlated features the dataset contains:', row, 'rows and', column, 'columns')
After dropping the correlated features the dataset contains: 1567 rows and 184 columns
# One boxplot per feature to eyeball outliers (open the figure full-size;
# the 22x10 subplot grid holds all 184 remaining features).
plt.figure(figsize=(50, 50))
for position, feature in enumerate(dfa1.columns, start=1):
    plt.subplot(22, 10, position)
    sns.boxplot(dfa1[feature], color='green')
# Outlier treatment: any value outside the Tukey 1.5*IQR fences is
# replaced by that column's (pre-replacement) median.
for feature in dfa1.columns:
    q1, q3 = dfa1[feature].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    is_outlier = (dfa1[feature] < lower_fence) | (dfa1[feature] > upper_fence)
    dfa1.loc[is_outlier, feature] = dfa1[feature].median()
# Re-draw the per-feature boxplots to verify the outlier replacement took
# effect (fences should now contain essentially all points).
plt.figure(figsize=(50, 50))
for position, feature in enumerate(dfa1.columns, start=1):
    plt.subplot(22, 10, position)
    sns.boxplot(dfa1[feature], color='green')
# Histogram per feature to check distribution shape and skewness.
dfa1.hist(bins = 30, figsize = (40, 40), color = 'green')
plt.show()
# Density plot per feature to inspect distribution shape and skewness.
# FIX: sns.distplot is deprecated and removed in current seaborn releases;
# histplot with kde=True and stat='density' is the documented replacement
# that reproduces the old hist + KDE overlay on a density scale.
plt.figure(figsize=(40, 40))
for position, feature in enumerate(dfa1.columns, start=1):
    plt.subplot(22, 10, position)
    sns.histplot(dfa1[feature], color='g', kde=True, stat='density')
# Re-attach the target column so dfa1 holds the cleaned features + label.
y=dfa['Pass/Fail']
dfa1=pd.concat([dfa1,y],axis=1)
dfa1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 185 entries, 0 to Pass/Fail dtypes: float64(184), int64(1) memory usage: 2.2 MB
# Correlation of "Pass/Fail" with every remaining feature, sorted descending.
# Open the image in a new tab for details.
plt.figure(figsize=(60,30))
dfa1.corr()['Pass/Fail'].sort_values(ascending = False).plot(kind='bar')
<AxesSubplot:>
# Candidate features to drop based on the correlation bar plot above:
# '224','432','53','253','82','119','221' (left commented for now).
# dfa1.drop(['224','432','53','253','82','119','221'], axis=1, inplace=True)
# Visualize the target variable to check for class imbalance.
# FIX: seaborn >= 0.13 no longer accepts the column name as the first
# positional argument of countplot (it would be taken as `data`);
# it must be passed explicitly as x=.
f,axes=plt.subplots(1,2,figsize=(17,7))
dfa1['Pass/Fail'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot(x='Pass/Fail', data=dfa1, ax=axes[1])
axes[0].set_title('Response Variable Pie Chart')
axes[1].set_title('Response Variable Bar Graph')
plt.show()
# Row counts per class -- confirms the imbalance (1463 vs 104 per the output).
dfa1.groupby(["Pass/Fail"]).count()
| 0 | 1 | 2 | 3 | 4 | 8 | 9 | 10 | 11 | 14 | ... | 562 | 565 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Pass/Fail | |||||||||||||||||||||
| 0 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | ... | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 | 1463 |
| 1 | 104 | 104 | 104 | 104 | 104 | 104 | 104 | 104 | 104 | 104 | ... | 104 | 104 | 104 | 104 | 104 | 104 | 104 | 104 | 104 | 104 |
2 rows × 184 columns
Insights from above graphs:
There is a big imbalance in the target vector.
If the imbalanced data is not treated beforehand, then this will degrade the performance of the ML model. Most of the predictions will correspond to the majority class and treat the minority class of features as noise in the data and ignore them. This results in a high bias and low performance of the model.
A widely adopted technique for dealing with highly unbalanced datasets is called re-sampling.
Two widely used re-sampling methods are under-sampling the majority class (e.g. RandomUnderSampler) and over-sampling the minority class (e.g. RandomOverSampler, SMOTE, ADASYN).
# Joint scatter with marginal distributions and a regression fit for
# features '8' and '9'.
sns.jointplot(data = dfa1, x="8", y="9", kind = "reg")
<seaborn.axisgrid.JointGrid at 0x27ccd1aed90>
Observations:
# Full pairwise correlation matrix (185 x 185, including the target).
dfa1.corr()
| 0 | 1 | 2 | 3 | 4 | 8 | 9 | 10 | 11 | 14 | ... | 565 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.000000 | -0.169097 | -0.001293 | 0.012024 | -0.032580 | -0.060591 | 0.024371 | 0.004452 | 0.044247 | 0.010903 | ... | 0.010564 | -0.067369 | 0.009589 | 0.035686 | 0.017238 | -0.026653 | -0.008951 | 0.008001 | -0.040191 | -0.062225 |
| 1 | -0.169097 | 1.000000 | 0.019656 | -0.015111 | 0.030349 | 0.032813 | 0.026130 | 0.005400 | -0.042919 | -0.045412 | ... | -0.015517 | 0.014864 | -0.045472 | 0.029182 | 0.035892 | 0.024358 | -0.017417 | 0.003141 | -0.002796 | -0.006868 |
| 2 | -0.001293 | 0.019656 | 1.000000 | 0.436001 | 0.148285 | 0.036864 | 0.015772 | 0.061970 | 0.045079 | -0.001856 | ... | 0.030394 | -0.074167 | -0.033315 | 0.012738 | 0.006126 | 0.003601 | -0.046442 | -0.036837 | -0.005474 | -0.034052 |
| 3 | 0.012024 | -0.015111 | 0.436001 | 1.000000 | 0.241856 | -0.000353 | 0.061426 | -0.002850 | 0.012644 | -0.090695 | ... | 0.061169 | -0.045090 | 0.009404 | 0.082516 | -0.019223 | -0.017588 | 0.011161 | -0.068373 | -0.045283 | -0.007574 |
| 4 | -0.032580 | 0.030349 | 0.148285 | 0.241856 | 1.000000 | -0.011200 | 0.016734 | -0.015835 | -0.005608 | 0.009619 | ... | 0.059144 | -0.021284 | 0.005597 | 0.038459 | 0.047429 | -0.012838 | 0.017337 | -0.007995 | 0.020781 | -0.011093 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 583 | -0.026653 | 0.024358 | 0.003601 | -0.017588 | -0.012838 | 0.012550 | -0.027123 | 0.004848 | 0.003763 | 0.010671 | ... | -0.006052 | 0.053395 | -0.027082 | -0.015064 | 0.001926 | 1.000000 | 0.011638 | 0.004284 | -0.002726 | -0.011167 |
| 586 | -0.008951 | -0.017417 | -0.046442 | 0.011161 | 0.017337 | 0.021318 | 0.011284 | -0.006049 | 0.001234 | -0.030707 | ... | 0.014872 | -0.020474 | -0.023540 | 0.013773 | -0.039897 | 0.011638 | 1.000000 | 0.048626 | -0.333710 | 0.010428 |
| 587 | 0.008001 | 0.003141 | -0.036837 | -0.068373 | -0.007995 | 0.048311 | 0.042630 | 0.026258 | -0.011129 | 0.007730 | ... | 0.029338 | 0.057177 | -0.018802 | 0.033287 | -0.035917 | 0.004284 | 0.048626 | 1.000000 | 0.415632 | 0.047873 |
| 589 | -0.040191 | -0.002796 | -0.005474 | -0.045283 | 0.020781 | 0.011978 | 0.003055 | 0.058635 | -0.018370 | 0.061740 | ... | -0.018287 | 0.060452 | 0.014878 | -0.001036 | 0.002756 | -0.002726 | -0.333710 | 0.415632 | 1.000000 | 0.026850 |
| Pass/Fail | -0.062225 | -0.006868 | -0.034052 | -0.007574 | -0.011093 | 0.025264 | -0.023599 | 0.029178 | -0.055440 | -0.065402 | ... | -0.024595 | 0.016800 | 0.040202 | 0.016508 | 0.049542 | -0.011167 | 0.010428 | 0.047873 | 0.026850 | 1.000000 |
185 rows × 185 columns
# Full correlation heatmap (open the image in a new tab for details).
# IMPROVEMENT: the original computed dfa1.corr() twice (once for the mask,
# once for the plot) -- an O(n^2 * rows) computation over 185 columns;
# hoist it so it runs once.
plt.figure(dpi = 300,figsize= (100,90))
corr_all = dfa1.corr()
# Mask the upper triangle so each feature pair is shown only once.
mask = np.triu(np.ones_like(corr_all))
sns.heatmap(corr_all, mask = mask, fmt = ".2f",annot=True,lw=1,cmap = 'plasma')
plt.yticks(rotation = 0)
plt.xticks(rotation = 90)
plt.title('Correlation Heatmap')
plt.show()